This is a walkthrough on how to recreate the hematopoiesis visualizations from Figure 2 of our Cell Systems paper.

Load the required libraries and data. The hematopoiesis data (from Paul et al) can be downloaded here.

library(Seurat)
## Loading required package: ggplot2
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
## Loading required package: Matrix
library(swne)
## 
## Attaching package: 'swne'
## The following object is masked from 'package:Seurat':
## 
##     ExtractField
library(monocle)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:Matrix':
## 
##     colMeans, colSums, rowMeans, rowSums, which
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## Loading required package: DDRTree
## Loading required package: irlba
library(ggplot2)

## Load monocle2 object
load("~/swne/Data/hemato_data/hemato_monocle_orig_cds.RData")

## Load counts and informative genes
counts <- ReadData("~/swne/Data/hemato_data/hemato_expr_debatched.tsv", make.sparse = T)
info.genes <- scan("~/swne/Data/hemato_data/hemato_info_genes.txt", sep = "\n", what = character())

## Load clusters
clusters.df <- read.table("~/swne/Data/hemato_data/hemato_cluster_mapping.csv", sep = ",")
clusters <- clusters.df[[2]]; names(clusters) <- clusters.df[[1]]
counts <- counts[,names(clusters)]

Filter genes and cells and make Seurat object

counts <- FilterData(counts, min.samples.frac = 0.005, trim = 0.005, min.nonzero.features = 200)
info.genes <- info.genes[info.genes %in% rownames(counts)]
dim(counts)
## [1] 8653 2730
se.obj <- CreateSeuratObject(counts)

Set cluster names, exclude lymphoid cells and dendritic cells (which are not part of the developmental trajectory), set cluster colors.

se.obj <- SetIdent(se.obj, cells.use = names(clusters), clusters)
se.obj@ident <- plyr::revalue(se.obj@ident, 
                              c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery',
                                "6" = 'Ery', "7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
                                "11" = 'DC', "12" = 'Bas', "13" = 'Bas', "14" = 'M', "15" = 'M',
                                "16" = 'Neu', "17" = 'Neu', "18" = 'Eos', "19" = 'lymphoid'))
se.obj <- SubsetData(se.obj, ident.remove = c("lymphoid", "DC"))

clusters <- se.obj@ident; names(clusters) <- se.obj@cell.names;
cluster_colors <- c("Bas" = "#ff6347", "Eos" = "#EFAD1E", 
                    "Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859", 
                    "GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")

Scale data, run PCA, t-SNE, and UMAP, and build the SNN.

## Scale data
se.obj@data <- ScaleCounts(se.obj@raw.data[,se.obj@cell.names], method = "log")
## calculating variance fit ... using gam
se.obj@scale.data <- se.obj@data - Matrix::rowMeans(se.obj@data)

## Run PCA
se.obj <- RunPCA(se.obj, pc.genes = info.genes, do.print = F, pcs.compute = 40)
PCElbowPlot(se.obj, num.pc = 40)

## Run t-SNE, UMAP, and SNN
se.obj <- RunTSNE(se.obj, dims.use = 1:15)
se.obj <- RunUMAP(se.obj, reduction.use = "pca", dims.use = 1:15)
se.obj <- BuildSNN(se.obj, dims.use = 1:15, k.param = 40, prune.SNN = 0.0, force.recalc = T)

Identify number of factors to use for SWNE

k.range <- seq(2,20,2) ## Range of factor values to test
n.cores <- 16 ## Number of cores to use

norm.counts <- se.obj@data[,names(clusters)]
k.err <- FindNumFactors(norm.counts[info.genes,], k.range = k.range, n.cores = n.cores, 
                        seed = 32590, do.plot = F)

PlotFactorSelection(k.err, font.size = 15)

Run SWNE

k <- 12
nmf.res <- RunNMF(norm.counts[info.genes,], k = k, init = "ica", n.cores = n.cores, ica.fast = F) ## Run NMF
nmf.scores <- nmf.res$H

nmf.res$W <- ProjectFeatures(norm.counts, nmf.res$H, n.cores = n.cores) ## Project all genes onto NMF

## Run SWNE embedding
swne.embedding <- EmbedSWNE(nmf.scores, SNN = se.obj@snn, alpha.exp = 1.5, snn.exp = 0.1, n_pull = 3,
                            dist.use = "cosine")
## Initial stress        : 0.09063
## stress after  10 iters: 0.03130, magic = 0.461
## stress after  20 iters: 0.02789, magic = 0.500
## stress after  30 iters: 0.02786, magic = 0.500

Identify key factors and embed key genes. See Table S1 from the paper for how these factors were named and key genes selected.

swne.embedding <- RenameFactors(swne.embedding, name.mapping = c("factor_4" = "Erythrocyte differentiation",
                                                                 "factor_5" = "Metal binding",
                                                                 "factor_9" = "Epigenetic regulation",
                                                                 "factor_10" = "HSC maintenance",
                                                                 "factor_11" = "Inflammation"))
genes.embed <- c("Apoe", "Mt2", "Gpr56", "Sun2", "Flt3")
swne.embedding <- EmbedFeatures(swne.embedding, nmf.res$W, genes.embed, n_pull = 3)

Make SWNE plot

PlotSWNE(swne.embedding, alpha.plot = 0.4, sample.groups = clusters, do.label = T,
         label.size = 3.5, pt.size = 2, show.legend = F) +
  scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

Make t-SNE plot

## tSNE plots
tsne.scores <- GetCellEmbeddings(se.obj, reduction.type = "tsne")
PlotDims(tsne.scores, sample.groups = clusters, show.legend = F, show.axes = F,
         alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
  scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

Make UMAP plot

umap.scores <- GetCellEmbeddings(se.obj, reduction.type = "umap")
PlotDims(umap.scores, sample.groups = clusters, show.legend = F, show.axes = F,
         alpha.plot = 0.5, label.size = 3.5, pt.size = 1.5) +
  scale_color_manual(values = cluster_colors)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

Extract pre-computed pseudotime (computed using Monocle2)

pseudotime <- cds$Pseudotime; names(pseudotime) <- colnames(cds@reducedDimS);
pseudotime <- pseudotime[names(clusters)]

SWNE pseudotime plot

FeaturePlotSWNE(swne.embedding, pseudotime, alpha.plot = 0.4, label.size = 3.5, pt.size = 1.5)

t-SNE pseudotime plot

FeaturePlotDims(tsne.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)

UMAP pseudotime plot

FeaturePlotDims(umap.scores, pseudotime, alpha.plot = 0.4, show.axes = F, pt.size = 1.5)

Now let’s compute a quantitative evaluation of SWNE, tSNE, and UMAP as well as other embeddings.

First we need to compute those other embeddings, including dmaps, MDS, Isomap, LLE, and PCA

library(destiny)
dm <- DiffusionMap(t(as.matrix(norm.counts[info.genes,])), k = 20, n_eigs = 2)
diff.scores <- dm@eigenvectors; rownames(diff.scores) <- colnames(norm.counts);

library(RDRToolbox)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
isomap.scores <- Isomap(t(as.matrix(norm.counts[info.genes,])), dims = 2, k = 20)$dim2
## Computing distance matrix ... done
## Building graph with shortest paths (using 20 nearest neighbours) ... done
## Computing low dimensional embedding ... done
## number of samples: 2669
## reduction from 2978 to 2 dimensions
## number of connected components in graph: 1
rownames(isomap.scores) <- colnames(norm.counts)

lle.scores <- LLE(t(as.matrix(norm.counts[info.genes,])), dim = 2, k = 20)
## Computing distance matrix ... done
## Computing low dimensional emmbedding (using 20 nearest neighbours)... done
rownames(lle.scores) <- colnames(norm.counts)

mds.scores <- cmdscale(dist(t(as.matrix(norm.counts[info.genes,]))), k = 2)
pc.scores <- GetCellEmbeddings(se.obj, dims.use = 1:2)

embeddings <- list(swne = t(as.matrix(swne.embedding$sample.coords)),
                   tsne = t(tsne.scores),
                   pca = t(pc.scores),
                   lle = t(lle.scores),
                   mds = t(mds.scores),
                   isomap = t(isomap.scores),
                   dmaps = t(diff.scores),
                   umap = t(umap.scores))

Next we define some helper functions that will help us evaluate these embeddings

library(FNN)
library(proxy)
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:destiny':
## 
##     as.matrix
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Calculate approximate kNN for an embedding
ComputeKNN <- function(emb, k) {
  knn.idx <- knn.index(t(emb), k = k)
  knn.matrix <- matrix(0, ncol(emb), ncol(emb))
  for (i in 1:nrow(knn.idx)) {
    knn.matrix[knn.idx[i,],i] <- 1
    knn.matrix[i, knn.idx[i,]] <- 1
  }
  rownames(knn.matrix) <- colnames(knn.matrix) <- colnames(emb)
  as(knn.matrix, "dgCMatrix")
}


## Calculate Jaccard similarities
CalcJaccard <- function(x,y) {
  a <- sum(x)
  b <- sum(y)
  c <- sum(x == 1 & y == 1)
  c/(a + b - c)
}


## Function for identifying cells in the same path and time step
GetPathStep <- function(metadata, step.size, make.factor = T) {
  path.step <- as.character(metadata$Group); names(path.step) <- rownames(metadata);
  for (path in levels(factor(metadata$Group))) {
    steps <- sort(unique(subset(metadata, Group == path)$Step))
    step.range <- seq(min(steps), max(steps), step.size)
    for(i in step.range) {
      cells.step <- rownames(subset(metadata, Group == path & Step %in% seq(i, i + step.size - 1, 1)))
      path.step[cells.step] <- paste(path, i, sep = ".")
    }
  }
  if (make.factor) {
    path.step <- factor(path.step)
  }
  path.step
}


## Calculate pairwise distances between centroids
CalcPairwiseDist <- function(data.use, clusters, dist.method = "euclidean") {
  data.centroids <- t(apply(data.use, 1, function(x) tapply(x, clusters, mean)))
  return(proxy::dist(data.centroids, method = dist.method, by_rows = F))
}

Compute how well each embedding maintains local structure compared to the original gene expression space by comparing kNN networks.

n.neighbors <- 40
ref.knn <- ComputeKNN(norm.counts[info.genes,], k = n.neighbors)

## Compute kNN for embeddings
embeddings.knn <- lapply(embeddings, ComputeKNN, k = n.neighbors)

knn.simil <- sapply(embeddings.knn, function(knn.emb) {
  mean(sapply(1:ncol(knn.emb), function(i) CalcJaccard(knn.emb[,i], ref.knn[,i])))
})

Compute how well each embedding maintains global strucutre by computing the centroids of each trajectory-cluster grouping in the embedding space and original gene expression space and correlating the pairwise distances between the centroids.

metadata.df <- data.frame(Group = clusters, Step = order(pseudotime[names(clusters)]))
clusters.steps <- GetPathStep(metadata.df, step.size = 50, make.factor = T)
traj.dist <- CalcPairwiseDist(norm.counts[info.genes,], clusters.steps)

embeddings.cor <- sapply(embeddings, function(emb) {
  emb.dist <- CalcPairwiseDist(emb, clusters.steps)
  cor(traj.dist, emb.dist)
})

Plot how well each embedding maintaings global vs local structure.

library(ggplot2)
library(ggrepel)

scatter.df <- data.frame(x = knn.simil, y = embeddings.cor, name = names(embeddings))
ggplot(scatter.df, aes(x, y)) + geom_point(size = 2, alpha = 1) +
  theme_classic() + theme(legend.position = "none", text = element_text(size = 16)) +
  xlab("Neighborhood Score") + ylab("Path-Time Distance Correlation") +
  geom_text_repel(aes(x, y, label = name), size = 5) +
  xlim(0, max(knn.simil)) + ylim(0, max(embeddings.cor))